---
title: "Binning Notebook"
output: html_notebook
---

This notebook calculates the optimal number of bins for each of the four
features used in the language-matching algorithm, and bins/scores the
training and testing data.

```{r}
library(BAMMtools) # for jenks breaks function in C
library(caret)
library(pbmcapply)
library(testit) # assertions
library(tidyverse)
set.seed(123)
```

To optimize the number of bins, we use repeated k-fold cross-validation for each
feature to determine the optimal number of breaks with the Jenks Breaks method.
This is selected by plotting the RMSE by number of bins and using the
elbow method.

This code runs repeated k-fold cross-validation for optimal binning using
pbmcmapply. This function runs mapply on 4 cores
(determined with `detectCores()`) in parallel and displays a progress bar with
an ETA. The result is then saved to `../data/cv_bins.rds`.
```{r}
train_df <- readRDS("../data/test_input_train.rds")
num_folds <- 10
num_repeats <- 3
max_num_bins <- 40
all_indices <- seq_len(nrow(train_df))
features <- c(
  "age",
  "spanish_address_score",
  "spanish_first_name_score",
  "spanish_surname_score"
)

generated_folds <- createMultiFolds(
  train_df$language,
  k = num_folds,
  times = num_repeats
)

df <- tibble(train_indices = generated_folds) %>%
  slice(rep(1:n(), each = length(features) * max_num_bins))
df <- cbind(df, tibble(num_bins = seq(1:max_num_bins)) %>%
                slice(rep(1:n(), each = length(features))) %>%
                slice(rep(row_number(), num_folds * num_repeats)))
df$feature <- rep_len(features, length.out = nrow(df))
```

```{r}
get_perf <- function(train_indices, num_bins, feature) {
    feature <- as.name(feature)
    test_indices <- setdiff(all_indices, train_indices)
    train_indices <- unlist(train_indices)
    cv_test_df <- train_df[test_indices, ]
    cv_train_df <- train_df[train_indices, ]
    min_val <- min(min(cv_train_df[[feature]]), min(cv_test_df[[feature]]))
    max_val <- max(max(cv_train_df[[feature]]), max(cv_test_df[[feature]]))
    breaks <- get_breaks(cv_train_df[[feature]], num_bins, min_val, max_val)
    train_predictions <- cv_train_df %>%
        mutate(
            bin = cut(!!feature, breaks, include.lowest = TRUE)
        ) %>%
        group_by(bin) %>%
        summarize(
            class_avg = mean(!!feature)
        )
    test_predictions <- cv_test_df %>%
        mutate(
            bin = cut(!!feature, breaks, include.lowest = TRUE)
        ) %>%
        left_join(train_predictions, by = c("bin"))
    return(RMSE(test_predictions$class_avg, test_predictions[[feature]]))
}

get_breaks <- function(data, num_bins, min_val, max_val) {
    breaks <- getJenksBreaks(data, num_bins + 1)
    breaks[1] <- min_val
    breaks[length(breaks)] <- max_val
    # Add jitter to avoid duplicate breaks.
    breaks <- breaks + (seq_along(breaks) - 1) * .Machine$double.eps
    assert(anyDuplicated(breaks) == 0)
    return(breaks)
}
```

```{r}
res <- pbmcmapply(
  get_perf, df$train_indices, df$num_bins, df$feature, mc.cores = 4
)
df <- cbind(df, res)
saveRDS(df, "../data/cv_bins_l2.rds")
```

To automate the elbow method, we take the number of bins that is farthest
from the line between the first and last point of the curve defined by the RMSE
and the number of bins.
```{r}
df <- readRDS("../data/cv_bins_l2.rds")
```

```{r}
# Equivalent to drawing a line between the first and last points of the curve
# and finding the data point farthest from that line.
elbow_finder_dist <- function(x_values, y_values) {
  # Max values to create line
  max_x_x <- max(x_values)
  max_x_y <- y_values[which.max(x_values)]
  max_y_y <- max(y_values)
  max_y_x <- x_values[which.max(y_values)]
  max_df <- data.frame(x = c(max_y_x, max_x_x), y = c(max_y_y, max_x_y))

  # Creating straight line between the max values
  fit <- lm(max_df$y ~ max_df$x)

  # Distance from point to line
  distances <- c()
  for (i in seq_len(length(x_values))) {
    distances <- c(
      distances,
      abs(coef(fit)[2] * x_values[i] - y_values[i] + coef(fit)[1]) / sqrt(
        coef(fit)[2]^2 + 1^2
      )
    )
  }

  # Max distance point
  x_max_dist <- x_values[which.max(distances)]

  return(x_max_dist)
}
```

```{r}
num_bins_by_feature <- tibble(
  feature = c(),
  bins_derivative = c(),
  bins_dist = c()
)
for (f in features) {
    rmse_per_bin <- df %>%
            filter(feature == f) %>%
            group_by(num_bins) %>%
            summarize(
                rmse = mean(res)
            )
    elbow_dist <- elbow_finder_dist(rmse_per_bin$num_bins, rmse_per_bin$rmse)
    print(
         rmse_per_bin %>%
            ggplot(aes(x = num_bins, y = rmse)) +
                geom_point() + geom_line() +
                geom_vline(xintercept = elbow_dist, color = "red") +
                xlab("Number of bins") +
                ylab("RMSE") +
                ggtitle(paste0("Elbow method for ", f))
    )
    num_bins_by_feature <- rbind(num_bins_by_feature, tibble(
        feature = c(f),
        bins_derivative = c(elbow_derivative),
        bins_dist = c(elbow_dist)
    ))
}
```

We can then bin each of the features by the optimal number of bins. In practice,
the number of individuals in the middle bin was too small for the last name
score feature, and we decided to use two bins instead of three as determined
by the algorithm.
```{r}
age_n <- num_bins_by_feature[
  which(num_bins_by_feature$feature == "age"), ]$bins_dist
addr_score_n <- num_bins_by_feature[
  which(num_bins_by_feature$feature == "spanish_address_score"), ]$bins_dist
first_name_score_n <- num_bins_by_feature[
  which(num_bins_by_feature$feature == "spanish_first_name_score"), ]$bins_dist
# last_name_score_n <- num_bins_by_feature[
# which(num_bins_by_feature$feature == "spanish_surname_score"), ]$bins_dist
last_name_score_n <- 2

age_breaks <- get_breaks(
  train_df$age,
  age_n,
  min(train_df$age),
  max(train_df$age)
)
addr_score_breaks <- get_breaks(
  train_df$spanish_address_score,
  addr_score_n,
  min(train_df$spanish_address_score),
  max(train_df$spanish_address_score)
)
first_name_score_breaks <- get_breaks(
  train_df$spanish_first_name_score,
  first_name_score_n,
  min(train_df$spanish_first_name_score),
  max(train_df$spanish_first_name_score)
)
last_name_score_breaks <- get_breaks(
  train_df$spanish_surname_score,
  last_name_score_n,
  min(train_df$spanish_surname_score),
  max(train_df$spanish_surname_score)
)

saveRDS(age_breaks, "../data/age_breaks_l2.rds")
saveRDS(addr_score_breaks, "../data/addr_score_breaks_l2.rds")
saveRDS(first_name_score_breaks, "../data/first_name_score_breaks_l2.rds")
saveRDS(last_name_score_breaks, "../data/last_name_score_breaks_l2.rds")
```

With bins for each individual feature, we then bin the training and test sets.
```{r}
train_df <- readRDS("../data/test_input_train_l2.rds")
test_df <- readRDS("../data/test_input_test_l2.rds")
age_breaks <- readRDS("../data/age_breaks_l2.rds")
addr_score_breaks <- readRDS("../data/addr_score_breaks_l2.rds")
first_name_score_breaks <- readRDS("../data/first_name_score_breaks_l2.rds")
last_name_score_breaks <- readRDS("../data/last_name_score_breaks_l2.rds")

binned_train_df <- train_df %>%
    mutate(
        age_bin = cut(age, age_breaks, include.lowest = TRUE),
        addr_score_bin = cut(
          spanish_address_score,
          addr_score_breaks,
          include.lowest = TRUE
        ),
        first_name_score_bin = cut(
          spanish_first_name_score,
          first_name_score_breaks,
          include.lowest = TRUE
        ),
        last_name_score_bin = cut(
          spanish_surname_score,
          last_name_score_breaks,
          include.lowest = TRUE
        ),
        is_spanish = as.numeric(toupper(language) == "SPANISH")
    )
saveRDS(binned_train_df, "../data/binned_train_df_l2.rds")

binned_test_df <- test_df %>%
    mutate(
        age_bin = cut(age, age_breaks, include.lowest = TRUE),
        addr_score_bin = cut(
          spanish_address_score,
          addr_score_breaks,
          include.lowest = TRUE
        ),
        first_name_score_bin = cut(
          spanish_first_name_score,
          first_name_score_breaks,
          include.lowest = TRUE
        ),
        last_name_score_bin = cut(
          spanish_surname_score,
          last_name_score_breaks,
          include.lowest = TRUE
        ),
        is_spanish = as.numeric(toupper(language) == "SPANISH")
    )
saveRDS(binned_test_df, "../data/binned_test_df_l2.rds")
```

